1 Preparing the environment for script execution

  1. Download maxent.jar and place it in the same folder as this document
  2. Download and install R
  3. Download and install RTools 3.5
  4. Download and install RStudio
  5. Install and load the devtools package (run the commands using the RStudio console)
    • install.packages("devtools")
    • library(devtools)
  6. Install the sdm package and the packages it depends on (run the commands using the RStudio console)
    • install.packages("sdm")
    • library(sdm)
    • installAll()
  7. Install all other required packages (run commands using RStudio console)
    • install.packages(c("raster", "rgdal", "sf", "rgeos", "ggplot2", "cowplot", "plotly", "mapview", "ggfortify", "scales", "factoextra", "ggcorrplot", "prettyGraphs", "Rtsne", "DT", "parallel", "snow", "sdm", "FactoMineR", "rdist", "usdm", "dplyr", "tidyr", "purrr", "purrrlyr", "here", "fs", "stringr", "exactextractr", "gdalUtils", "dismo", "rgbif", "boot", "httr"))

2 Downloading data:

2.1 Downloading rasters

First we create a folder in our working directory to include input data.

# Create a folder in the working directory to store input data
# (skipped if it already exists).
if (!dir_exists("input_data")) {
  dir_create("input_data")
}

Now let’s download data from WorldClim 2.1. Note that when R downloads a file it has a timeout of 60 seconds. This may not be enough to download environmental data, so we can set options(timeout=n), where n is the number of seconds we need to download the data.

# Control how long R may spend on a download. If fetching the data takes
# more than 10 minutes (600 seconds) the download is aborted; increase the
# timeout if your connection needs it.
options(timeout = 600)

# Current-scenario data.
# Files are automatically saved in the input_data folder.
WorldClim_data('current', variable = 'bioc', resolution = 10)
## [1] "The file for current scenario is already downloaded."
# Future data.
# Four general circulation model (GCM) codes; presumably these follow the
# WorldClim 2.1 GCM naming scheme — TODO confirm against the WorldClim docs.
gcms <- c('cc', 'gg', 'mr', 'uk')
# Download SSP2-4.5 projections for 2090 at 10 arc-minutes, one file per GCM.
WorldClim_data('future', variable = 'bioc', year = '2090', gcm = gcms, ssp = c('245'), resolution = 10) 
## [1] "The file for future scenario (input_data/WorldClim_data_future/cc_ssp245_10m_2090.tif) is already downloaded."
## [1] "The file for future scenario (input_data/WorldClim_data_future/gg_ssp245_10m_2090.tif) is already downloaded."
## [1] "The file for future scenario (input_data/WorldClim_data_future/mr_ssp245_10m_2090.tif) is already downloaded."
## [1] "The file for future scenario (input_data/WorldClim_data_future/uk_ssp245_10m_2090.tif) is already downloaded."

2.2 Obtaining occurrence data from GBIF:

# Downloading data from GBIF
# File is automatically saved in input_data folder
# spp_data <- GBIF_data('Colossoma macropomum')

2.3 Downloading shapefile for study area

# Obtaining Natural Earth data:
#shape_study_area <- rnaturalearth::ne_download(scale = 50, type = "rivers_lake_centerlines", category = "physical")

3 Geoprocessing:

3.1 Open Files and Data

First, we name inputs and outputs, taking care to use the correct file extensions. a) Inputs:

# Shapefile (polygon or lines) delimiting the study area.
# here() builds paths relative to the project root.
shape_study_area_file <- here("input_data/shape_study_area/AmazonHydroRivers4.shp")  

# Directory containing the current-scenario rasters to be rescaled.
folder_current_rasters <- here("input_data/WorldClim_data_current")

# Directory containing the future-scenario rasters to be rescaled.
folder_future_rasters <- here("input_data/WorldClim_data_future")
  1. Outputs:
# Name of the shapefile (.shp) where the study-area grid will be saved.
output_shp_study_area <- here("output_data/grid/Colossoma_grid.shp")

# Name of the R object (.rds) where the current rescaled variables will be saved.
output_shp_current <- here("output_data/WorldClim_data_current_rescaled/Colossoma_current.rds")

# Build scenario names: every GCM x SSP x resolution x year combination,
# joined with "_" (e.g. "cc_ssp245_10_2090").
scenarios <- apply(expand.grid(gcms, c("ssp245"),"10", 2090), 1, paste, collapse="_")

# Names of the R objects (.rds), one per scenario, where the future rescaled
# variables will be saved.
output_shp_future <- here(paste0("output_data/WorldClim_data_future_rescaled/",
                                 scenarios,
                                 ".rds"))
  1. Setting up some important variables:
# Cell height and width for the grid.
# These values depend on the raster projection: UTM rasters give meters,
# decimal-degree rasters (as in WorldClim 2.1) give degrees. The make_grid
# function (used below to build the grid over the study area) has an epsg
# argument that reprojects the spatial data, and the study area's EPSG is
# passed on to the predictor variables. So even though WorldClim 2.1 is in
# decimal degrees, cell sizes must be given in the units of the target EPSG.

# The following values build a grid of 100 x 100 km (EPSG 6933 is in meters).
epsg <- 6933
cell_width <- 100000
cell_height <- 100000
# Note: cells smaller than the input raster resolution may generate NaNs.

# Variables from shape_study_area to keep when rescaling.
# Use names of at most 10 characters (ESRI shapefile field-name limit).
# list() keeps none of the shape's variables; NULL keeps all of them.
names_var_shp_study_area <- c("NEXT_DOWN", "MAIN_RIV", "LENGTH_KM",
                              "DIST_DN_KM", "DIST_UP_KM", "CATCH_SKM", "UPLAND_SKM",
                              "DIS_AV_CMS", "ORD_STRA", "ORD_CLAS", "ORD_FLOW")
# The 19 WorldClim bioclimatic variable names (bio_1 ... bio_19).
raster_vars <- paste0('bio_', 1:19)

# As in the codeline above, here we set which variables in current rasters we want to keep for rescaling.
# Set the correct names of variables using as much as 10 characters.
# Setting the names to list() will use none of the variables in the shape, while setting it to NULL will use all variables.
current_var_names <- c(names_var_shp_study_area, raster_vars) # or NULL

# As in the codelines above, here we set which variables in future rasters we want to keep for rescaling.
# We will usually need at least the same variables as in the current scenario for projection.
# Set the correct names of variables using as much as 10 characters.
# Setting the names to list() will use none of the variables in the shape, while setting it to NULL will use all variables.
future_var_names <-  current_var_names 

3.2 Study Area

The map of study area needs to be imported to R, so we can create a grid for the study area. This grid will be used for model building and projections.

# Import the study-area shapefile and repair any invalid geometries.
shape_study_area <- repair_shp(st_read(shape_study_area_file))
## Reading layer `AmazonHydroRivers4' from data source 
##   `/Users/luizesser/Documents/GitHub/scriptsdm/input_data/shape_study_area/AmazonHydroRivers4.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 107294 features and 14 fields
## Geometry type: LINESTRING
## Dimension:     XY
## Bounding box:  xmin: -79.37708 ymin: -20.21042 xmax: -48.44375 ymax: 5.139583
## Geodetic CRS:  WGS 84
# Build the study-area grid if it has not been saved yet; otherwise reload it.
if (!file_exists(output_shp_study_area)) {
  grid_study_area <- shape_study_area %>% 
      make_grid(cell_width, cell_height, names_var_shp_study_area, epsg=epsg) # target EPSG
  
  # Make sure the output directory exists before writing.
  output_shp_study_area %>% 
    path_dir() %>% 
    dir_create()
  
  grid_study_area %>% st_write(
      dsn = output_shp_study_area)
} else {
  grid_study_area <- output_shp_study_area %>% st_read()
}
## Reading layer `Colossoma_grid' from data source 
##   `/Users/luizesser/Documents/GitHub/scriptsdm/output_data/grid/Colossoma_grid.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 642 features and 14 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -7600000 ymin: -2500000 xmax: -4600000 ymax: 7e+05
## Projected CRS: WGS 84 / NSIDC EASE-Grid 2.0 Global

3.3 Rescaling variables

The next step aims to cross data from study area with rasters of variables. We will start with current data.

### Rescaling current data
# Cross the study-area grid with the current rasters if not done yet;
# otherwise reload the saved result.
if (!file_exists(output_shp_current)) {
  grid_current <- grid_study_area %>% 
    add_raster(folder_current_rasters, raster_vars) 
    
  # Make sure the output directory exists before writing.
  output_shp_current %>% 
    path_dir() %>% 
    dir_create()
  
  grid_current %>% saveRDS(output_shp_current)
} else {
  grid_current <- output_shp_current %>% readRDS()
}

Now within a loop to rescale future variables.

### Rescaling future data
# Cross the study-area grid with each future scenario's rasters,
# skipping the whole loop when every output file already exists.
if (!all(file_exists(output_shp_future))) {
  # Create the output directories once, outside the loop (loop-invariant).
  output_shp_future %>%
    path_dir() %>%
    dir_create()

  for (i in seq_along(scenarios)) {
    grid_future <- grid_study_area %>%
      add_raster(folder_future_rasters, future_var_names, scenarios[i])

    # Save under the output path whose file name matches this scenario.
    grid_future %>%
      saveRDS(output_shp_future[grep(scenarios[i], output_shp_future, fixed = TRUE)])
  }
}

# Read every rescaled future scenario back into a list named by scenario.
grid_future <- lapply(output_shp_future, readRDS)
names(grid_future) <- scenarios

print(grid_future[[1]])
## Simple feature collection with 642 features and 33 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -7600000 ymin: -2500000 xmax: -4600000 ymax: 7e+05
## Projected CRS: WGS 84 / NSIDC EASE-Grid 2.0 Global
## First 10 features:
##    cell_id next_down main_riv length_km dist_dn_km dist_up_km catch_skm
## 1       14  61289429 60443230  4.293571   4552.214   75.19286  16.57214
## 2       15  61283896 60443230  4.080200   4425.778  204.43800  16.11340
## 3       16  61287342 60443230  7.184286   4180.364  268.33929  25.34786
## 4       42  61269036 60443230  4.506000   4856.160   56.50000  19.38600
## 5       43  61267896 60443230  4.825312   4768.006  166.03281  20.73984
## 6       44  61270528 60443230  4.130400   4565.809  271.66000  16.76870
## 7       45  61276281 60443230  3.645634   4357.465  458.27042  13.64366
## 8       46  61272984 60443230  5.298833   4139.190  420.98000  18.32750
## 9       47  61266914 60443230  6.261935   3980.697  185.00968  30.28581
## 10      48  61266777 60443230  4.982143   3972.986   87.49286  25.37643
##    upland_skm dis_av_cms ord_stra ord_clas ord_flow x_centroid y_centroid
## 1    1393.529  3.1836429 4.000000 3.000000 6.000000   -6250000   -2450000
## 2   10805.804 23.1558000 4.320000 2.840000 5.720000   -6150000   -2450000
## 3    6758.746  8.8958929 4.000000 3.571429 5.392857   -6050000   -2450000
## 4    1299.480  2.5990000 4.000000 4.000000 6.000000   -6450000   -2350000
## 5    7113.725 17.9444687 4.812500 3.031250 5.312500   -6350000   -2350000
## 6   16758.634 44.0094300 5.130000 2.500000 5.290000   -6250000   -2350000
## 7   38211.137 82.4315775 5.436620 2.281690 4.816901   -6150000   -2350000
## 8   26306.177 41.4814667 4.900000 3.116667 5.183333   -6050000   -2350000
## 9    5719.913  7.2719355 4.354839 4.548387 6.322581   -5950000   -2350000
## 10   1870.321  0.2140714 4.285714 5.714286 8.357143   -5850000   -2350000
##       bio_1    bio_2    bio_3    bio_4    bio_5     bio_6    bio_7    bio_8
## 1  20.24684 13.84176 68.54555 204.1144 29.70086  9.527322 20.17353 20.38651
## 2  26.55916 12.37412 59.97032 292.1918 36.66457 16.007487 20.65708 27.52043
## 3  29.18168 13.49217 58.02394 341.9572 40.75657 17.496642 23.25993 30.51738
## 4  12.79734 16.66331 67.75534 253.9190 23.42181 -1.163224 24.58503 14.64541
## 5  16.63256 16.14285 69.75631 210.2901 26.97781  3.841557 23.13626 16.96295
## 6  22.87325 13.98092 67.31177 216.3198 32.78920 12.011138 20.77806 22.89831
## 7  26.41511 12.70065 62.58097 262.6446 36.53996 16.217002 20.32296 26.91273
## 8  29.69522 13.21537 58.52635 320.3550 41.18508 18.604747 22.58034 30.59864
## 9  30.29418 13.18291 55.77331 333.2756 42.30888 18.682555 23.62632 30.98534
## 10 30.72369 13.52517 56.58782 318.4834 42.81487 18.929925 23.88494 30.94970
##        bio_9   bio_10    bio_11   bio_12   bio_13    bio_14    bio_15   bio_16
## 1  17.073420 22.64973 17.458527 574.2856 132.4124  2.353883  97.12121 344.7287
## 2  24.139089 30.01422 22.671924 661.7264 141.1422  5.997128  87.11182 375.7356
## 3  26.611829 33.43858 24.843783 538.5381 111.9852  3.509868  86.21881 301.7986
## 4   8.646783 15.30873  9.223421 378.9018 106.2629  1.331266 111.78573 251.1362
## 5  13.716360 19.08248 13.811794 530.0330 144.3075  1.773711 106.48774 338.8569
## 6  19.962216 25.56179 20.015558 533.9380 131.0196  2.410201 104.74298 340.2003
## 7  24.445456 29.70589 23.050371 681.6485 147.9890  7.481844  85.50568 381.8590
## 8  27.253710 33.83632 25.686743 629.1914 116.2168  8.831161  76.68114 324.2497
## 9  27.885419 34.74869 26.172898 626.2308 114.5745 11.187795  72.26914 311.2446
## 10 28.678516 35.03932 26.795579 711.6006 123.6856 15.287990  66.94596 338.6468
##       bio_17   bio_18    bio_19                       geometry
## 1  11.353190 168.4213 12.093882 POLYGON ((-6300000 -2500000...
## 2  24.672886 170.3395 40.618524 POLYGON ((-6200000 -2500000...
## 3  16.088348 136.3927 37.042820 POLYGON ((-6100000 -2500000...
## 4   7.090657 208.8186  8.074054 POLYGON ((-6500000 -2400000...
## 5   7.903049 148.0404 11.308218 POLYGON ((-6400000 -2400000...
## 6  10.573087 139.7532 12.505996 POLYGON ((-6300000 -2400000...
## 7  30.472545 170.0650 52.635803 POLYGON ((-6200000 -2400000...
## 8  31.415579 170.2667 56.973362 POLYGON ((-6100000 -2400000...
## 9  37.018825 178.1825 65.417373 POLYGON ((-6e+06 -2400000, ...
## 10 49.273950 205.9026 84.190579 POLYGON ((-5900000 -2400000...

4 Occurrence data

4.1 Open files with data

It is necessary to name the output. Be extra careful with extension names. a) Input:

# Path to the occurrence CSV in input_data (presumably written by the
# GBIF_data() step above — confirm if using a different source).
spp_data <- here('input_data/spp_data.csv')
  1. Output:
# Path to the output shapefile that will contain the per-cell
# presence/absence matrix.
spp_output <- here("output_data/spp_pa.shp")

It is also necessary to set some other important parameters.

# Species names to be used in the study.
# Names must match those in input_data/spp_data.csv obtained previously.
# Setting this to NULL will use all species.
#spp_names <- especie # or NULL
spp_names <- c("Colossoma.macropomum")

4.2 Importing occurrence data

# Convert the occurrence CSV into a point shapefile aligned to the grid.
occ_shp <- spp_data %>% 
  occurrences_to_shapefile(spp_names, grid_study_area)


# Interactive map: study-area grid outline (red) with occurrence points
# colored by species.
mapview(grid_study_area[,1], alpha.regions = 0, color = "red", lwd = 1, layer.name = "Study Area", legend=NULL) +
  mapview(occ_shp, zcol = "species", layer.name = "Species") 

4.3 Occurrence grid for the Study Area

We will say to the grid_study_area which cells have an occurrence record for the studied species.

  # Abbreviated species names (snake_case, max 10 characters) used as
  # shapefile column names — ESRI shapefiles truncate fields to 10 chars.
  spp_names_abv <- spp_names %>% 
    to_snake_case() %>% 
    abbreviate(minlength = 10) %>% 
    as.vector()

# Build the presence/absence grid if not saved yet; otherwise reload it.
if (!file_exists(spp_output)) {
  grid_matrix_pa <- occ_shp %>% 
    occurrences_to_pa_shapefile(grid_study_area, spp_names)
  
  # Make sure the output directory exists before writing.
  spp_output %>% 
    path_dir() %>% 
    dir_create()
  
  # Keep only the species presence/absence columns.
  grid_matrix_pa <- grid_matrix_pa %>% select(all_of(spp_names_abv))
  
  grid_matrix_pa %>% st_write(dsn = spp_output)
} else {
  grid_matrix_pa <- spp_output %>% st_read()
}
## Reading layer `spp_pa' from data source 
##   `/Users/luizesser/Documents/GitHub/scriptsdm/output_data/spp_pa.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 642 features and 1 field
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -7600000 ymin: -2500000 xmax: -4600000 ymax: 7e+05
## Projected CRS: WGS 84 / NSIDC EASE-Grid 2.0 Global
# Species richness per grid cell: row sums of the presence/absence matrix,
# plotted over the study-area grid.
cell_richness <- grid_matrix_pa %>%
  as.data.frame() %>%
  select(all_of(spp_names_abv)) %>%
  rowSums() %>%
  as.vector()
richness_map(cell_richness, grid_study_area)

Check how many records there are for each species:

# Count presence records per species in the grid.
presences_number(grid_matrix_pa, spp_names)

5 Variable Selection - VIF

Variable Selection must be done OR with a VIF routine, OR with a PCA. To perform a VIF routine, we will use only the environmental data from the occurrence data.

# Build, per species, a data.frame of environmental values at occurrence
# cells (used as input for the VIF routine below).
df_vif <- get_df_vif(grid_current, grid_matrix_pa)
## var_names not informed. Variables detected:  next_down, main_riv, length_km, dist_dn_km, dist_up_km, catch_skm, upland_skm, dis_av_cms, ord_stra, ord_clas, ord_flow, bio_1, bio_2, bio_3, bio_4, bio_5, bio_6, bio_7, bio_8, bio_9, bio_10, bio_11, bio_12, bio_13, bio_14, bio_15, bio_16, bio_17, bio_18, bio_19sp_names not informed. Detected species:  clssm_mcrp
# Run the VIF routine from the usdm package on each species' data.frame,
# excluding variables until pairwise correlations fall below th = 0.5.
vif_bio <- lapply(df_vif, vifcor, th = 0.5)
vif_bio[[1]]
## 23 variables from the 30 input variables have collinearity problem: 
##  
## bio_17 bio_16 upland_skm catch_skm bio_15 bio_7 bio_11 bio_6 bio_1 ord_stra bio_12 bio_19 bio_3 dist_up_km bio_2 bio_10 bio_9 bio_14 bio_18 next_down length_km dis_av_cms ord_clas 
## 
## After excluding the collinear variables, the linear correlation coefficients ranges between: 
## min correlation ( bio_4 ~ main_riv ):  -0.005198821 
## max correlation ( bio_4 ~ ord_flow ):  0.3962631 
## 
## ---------- VIFs of the remained variables -------- 
##    Variables      VIF
## 1   main_riv 1.074130
## 2 dist_dn_km 1.387095
## 3   ord_flow 1.367601
## 4      bio_4 1.453938
## 5      bio_5 1.260669
## 6      bio_8 1.293997
## 7     bio_13 1.186953
# We can exclude variables with high VIF or go to the next step.
# Drop the collinear variables flagged by vifcor for each species.
# simplify = FALSE guarantees a named list of sf objects — the default
# simplify = TRUE could collapse same-shaped results into a matrix.
grid_current <- sapply(names(df_vif),
                       function(x) select(grid_current, !vif_bio[[x]]@excluded),
                       simplify = FALSE, USE.NAMES = TRUE)

6 Variable Selection - PCA

  1. Outputs:
# Name of the R object (.rds) in which the PCA values will be stored.
shp_pca <- here("output_data/pca/shp_pca.rds")

# Name of the R object (.rds) in which the current PCA projection will be stored.
shp_preds <- here("output_data/pca/shp_preds.rds")

# Name of the R object (.rds) in which the future PCA projections will be stored.
shp_preds_future <- here("output_data/pca/shp_preds_future.rds")
  1. Transforming objects to be used in the next steps:
# Keep the spatial presence/absence object under a separate name.
shp_matrix_pa <- grid_matrix_pa

# Presence/absence matrix as a plain data.frame (geometry column dropped).
df_species <- shp_matrix_pa %>% 
  as.data.frame() %>%
  select(-c('geometry'))

# Predictor variables read back from the saved current-scenario grid.
df_var_preditors <- output_shp_current %>%
  get_predictors_as_df()

6.1 Control Variables

# Names of variables to be normalized (centered and scaled).
# Normalization can improve modelling, the PCA calculation and the
# clusterization algorithms used to generate pseudoabsences.
var_normalization <- tolower(current_var_names) # OR: paste0("bio_",1:19)

# Names of variables to be used in the PCA.
var_pca_bio <- var_normalization # OR: paste0("bio_",1:19)

# Number of PCA axes to be retained.
nr_comp_pca_bio <- 4

# Names of variables to be used as predictors when training the models.
# They can be raw environmental variables or PCA axes (as set here).
var_predictors <-c("dim_1_bio","dim_2_bio","dim_3_bio","dim_4_bio")

6.2 Preparing Predictor Variables

6.2.1 Standardizing and normalizing

# Join the species presence/absence columns with the predictor variables.
df_potential_predictors <- df_species %>%
  bind_cols(df_var_preditors)

# Center and scale the variables selected for normalization.
df_potential_predictors <- df_potential_predictors %>% 
  center_scale(var_normalization)

df_potential_predictors %>% 
  head() %>% 
  round(4) %>% 
  datatable(options = list(pageLength = 10, scrollX = TRUE))

6.3 Preparing Variables for PCA

# Keep only the PCA input variables that are present in the predictor table.
df_var_pca <- df_potential_predictors %>% 
  select(var_pca_bio[which(var_pca_bio %in% colnames(df_potential_predictors))])

df_var_pca %>%
  head() %>% 
  round(4) %>% 
  datatable(options = list(pageLength = 10, scrollX = TRUE))

6.3.1 PCA analysis for variables.

6.3.1.1 Correlation Matrix

# Correlation matrix plot of the PCA input variables.
df_var_pca %>% 
  corr_plot()
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

6.3.1.2 PCA analysis

The table that follows shows the values of the axes and variables.

# Remove any stale PCA output before recomputing.
if (file_exists(shp_pca)) {
  file_delete(shp_pca)
}
# Run the PCA, keeping nr_comp_pca_bio axes, with the "bio" suffix on axis names.
pca_bio <- df_var_pca %>% 
  calc_pca(nr_comp_pca_bio, "bio")

pca_bio$var$loadings %>% 
  round(4) %>% 
  datatable(options = list(pageLength = 10, scrollX = TRUE))

Save pca with study area:

# Make sure the PCA output folder exists.
if (!dir_exists("output_data/pca/")) {
  dir_create("output_data/pca/")
}

# Attach each cell's PCA coordinates to the study-area grid and save it.
grid_study_area %>%
  bind_cols(as.data.frame(pca_bio$ind$coord)) %>%
  saveRDS(file = shp_pca)

Summary of dimensions. The following table shows the correlation between variables with axes (i.e. the significance from each variable to given axle).

# Table of correlations between variables and PCA axes.
pca_bio %>% 
  dt_pca_summ()

6.3.1.3 Variable Contribution

# Scree plot of variable contributions.
pca_bio %>% 
  contrib_scree()

# Contribution correlation plot.
pca_bio %>% 
  contrib_corr()

# Variable contributions per retained dimension.
pca_bio %>% 
  contrib_dims()

6.3.1.4 Quality of Variables’ Representation

PCA-axes considering the quality of variables’ representation.

# PCA axes weighted by quality of variable representation (cos2).
pca_bio %>% 
  pca_cos2()

Quality of variables’ representation on axes.

# cos2 of variables per dimension.
pca_bio %>% 
  cos2_dims()

# cos2 correlation plot.
pca_bio %>% 
  cos2_corr()

# Biplot colored by whether each cell has any presence (row means ceiling'd
# to 0/1 across species).
pca_bio %>% 
  pca_bi_plot(df_species %>% rowMeans() %>% ceiling())

# How many components to retain under three stopping rules
# (broken stick, Kaiser mean, Horn's parallel analysis).
pca_bio %>% 
  comp_pca_retain()
## $broken.stick
## [1] 4
## 
## $kaiser.mean
## [1] 7
## 
## $horn.p
## [1] 5

6.4 Generating Shapefile with Predictors

6.4.1 Jointing PCA to variables

# Append the PCA cell coordinates to the table of potential predictors.
df_potential_predictors <- df_potential_predictors %>%
  bind_cols(pca_bio$ind$coord %>% as.data.frame()) 

df_potential_predictors %>% 
  head() %>% 
  round(4) %>%
  datatable(options = list(pageLength = 10, scrollX = TRUE))

# Save the selected predictor columns alongside the grid cell ids.
if (!file_exists(shp_preds)) {
  df_var_preditors <- df_potential_predictors %>% 
    select(all_of(var_predictors))
  
  grid_study_area %>% 
    select(cell_id) %>% 
    bind_cols(df_var_preditors) %>% 
    saveRDS(file = shp_preds)
}

6.5 Projecting PCA-axes to the future scenarios.

# Project the fitted PCA onto each future scenario's variables.
pca_future <- proj_pca(pca_bio, grid_future)
pca_future[[1]]

# Save all future PCA projections in one .rds file.
pca_future %>% 
    saveRDS(file = shp_preds_future)

7 Training Models

7.1 Set Data

As in previous steps, it is necessary to set inputs and outputs, taking extra care with extension names. a) Input: If you want to use PCA:

# To use PCA axes as predictors (run EITHER this chunk OR the VIF chunk below):
df_potential_predictors_note <- NULL # NOTE(review): placeholder comment only
df_var_preditors <- df_potential_predictors[,colnames(df_potential_predictors) %in% var_predictors] # PCA
grid_future <- pca_future # PCA
names(grid_future) <- scenarios
grid_future[[1]]

If you want to use VIF or raw predictors:

# To use raw predictors (VIF) — overrides the PCA chunk above if both are run:
df_var_preditors <- output_shp_current %>%
  get_predictors_as_df() %>% select(-c('cell_id'))
#df_var_preditors <- grid_current %>% as.data.frame() %>% select(-c('geometry', 'cell_id', 'y_centroid', 'x_centroid')) # raw predictors
grid_future <- lapply(output_shp_future,function(x){readRDS(x)}) # raw predictors
names(grid_future) <- scenarios
grid_future[[1]]
  1. Outputs:
# Directory where trained models will be saved.
folder_models <- here("output_data/models")
  1. Control Variables:
# Algorithm names to be used in Species Distribution Modeling.
# Run getmethodNames() to see which algorithms are available.
algo <- c("rbf", "svm", "mda")

# Threshold criterion (index into the list below):
# 1:sp=se, 2:max(se+sp), 3:min(cost), 4:minROCdist, 5:max(kappa), 6:max(ppv+npv), 7:ppv=npv, 8:max(NMI), 9:max(ccr), 10: prevalence
thresh_criteria <- 2

# Number of runs per algorithm.
n_run <- 10

# Number of folds for cross-validation.
n_folds <- 4

# Number of pseudoabsence sets.
n_pa <- 1

7.2 Generate Pseudoabsences

To build models, it is necessary to use pseudoabsences that contrast with presences. The generation method is set via the `method` argument (here, 'envelope').

# Generate pseudoabsence points per species and save them under folder_models.
# NOTE(review): method = "envelope" is used here — confirm this is the
# intended pseudoabsence method for your analysis.
df_pseudoabsences <- shp_matrix_pa %>%
  pseudoabsences(df_var_preditors, spp_names, method="envelope", folder_models) 

It is possible to plot a t-SNE graph to check whether pseudoabsence data clusters into a separate group from presence data.

# t-SNE plot per species to check whether pseudoabsences cluster apart
# from presences in environmental space.
tsne_list <- df_potential_predictors %>% 
  tsne_plot(df_pseudoabsences, spp_names)
## Warning: Using one column matrices in `filter()` was deprecated in dplyr 1.1.0.
## ℹ Please use one dimensional logical vectors instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(sp)
## 
##   # Now:
##   data %>% select(all_of(sp))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Display the t-SNE plots.
tsne_list
## [[1]]

7.3 Join data

As we are using the sdm package, let’s start to build our models by indicating our input data.

# NOTE(review): the two loops below are alternatives — run only the one
# matching the variable-selection routine chosen above (PCA or VIF);
# running both in sequence will apply both filters to the pseudoabsences.
# For PCA routine:
for(s in colnames(df_species)){
  df_pseudoabsences[[s]] <- df_pseudoabsences[[s]][,var_predictors]
}
# For VIF routine:
for(s in colnames(df_species)){
  df_pseudoabsences[[s]] <- exclude(df_pseudoabsences[[s]], vif_bio[[s]])
}
# Combine presences, predictors and pseudoabsences into sdm-ready data.
d <- df_species %>% 
  fit_data(df_var_preditors, df_pseudoabsences)
## Loading required package: dismo
## Loading required package: gbm
## Loaded gbm 2.1.8.1
## Loading required package: tree
## Loading required package: mda
## Loading required package: class
## Loaded mda 0.5-3
## Loading required package: mgcv
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## The following object is masked from 'package:usdm':
## 
##     Variogram
## The following object is masked from 'package:raster':
## 
##     getData
## This is mgcv 1.8-41. For overview type 'help("mgcv-package")'.
## Loading required package: glmnet
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-6
## Loading required package: earth
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
## 
## Attaching package: 'plotrix'
## The following object is masked from 'package:scales':
## 
##     rescale
## Loading required package: TeachingDemos
## 
## Attaching package: 'TeachingDemos'
## The following object is masked from 'package:plotly':
## 
##     subplot
## Loading required package: rJava
## Loading required package: RSNNS
## Loading required package: Rcpp
## Loading required package: ranger
## Loading required package: randomForest
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ranger':
## 
##     importance
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
## Loading required package: rpart
## Loading required package: kernlab
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:purrr':
## 
##     cross
## The following object is masked from 'package:scales':
## 
##     alpha
## The following object is masked from 'package:ggplot2':
## 
##     alpha
## The following objects are masked from 'package:raster':
## 
##     buffer, rotated
d[[1]]
## class                                 : sdmdata 
## =========================================================== 
## number of species                     :  1 
## species names                         :  clssm_mcrp 
## number of features                    :  7 
## feature names                         :  main_riv, dist_dn_km, ord_flow, ... 
## type                                  :  Presence-Background 
## has independet test data?             :  FALSE 
## number of records                     :  194 
## has Coordinates?                      :  FALSE

7.4 Training Models

With the data, we can build our models.

# Train all algorithms for each species and write the fitted models
# into folder_models.
df_species %>%
  train_models_to_folder(
      d, 
      algo, 
      n_run, 
      n_folds, 
      folder_models
    )
## 
## The variable importance for all the models are combined (averaged)...
# Show the first few saved model file names.
folder_models %>%
  dir_ls() %>%
  path_file() %>% 
  head() %>%
  as.data.frame()
# Report how many species have trained model files saved.
print(paste0("Number of trained species: ", length(dir_ls(folder_models))))
## [1] "Number of trained species: 1"

How many models failed?

# Report models that failed to train.
d %>% 
  model_failures(folder_models)

7.5 Model Selection and Threshold visualization

# NOTE(review): this overwrites spp_names with the column names of
# df_species, which are the abbreviated shapefile names (e.g. "clssm_mcrp"),
# not the original species names.
spp_names <- colnames(df_species)

# Per-species thresholds under the chosen criterion.
thresholds_models <- spp_names %>% 
  sp_thresh_from_folder(folder_models, thresh_criteria)

# Mean validation metrics across runs/folds.
thresholds_models_means <- spp_names %>%  
  validation_metrics_from_folder(folder_models, thresholds_models)

# Select models with AUC above 0.8.
model_selection <- spp_names %>%  
  validation_metrics_from_folder(folder_models, thresholds_models, stats = 'AUC', th = 0.8)

thresholds_models_means[[1]]

8 Projections

To project our models in space, we need to set where the models were saved (previously set as the folder_models object) and where we want to save our projections.

# Directory where projections will be saved.
directory_projections <- here("output_data/projections")

Set up some variables.

# Presence/absence matrix as a plain data.frame (geometry dropped).
df_pa <- shp_matrix_pa %>%
  as.data.frame() %>%
  select(-c('geometry'))

# Presences joined with the predictor table.
df_potential_predictors <- df_pa %>%
  bind_cols(df_var_preditors)

# One data.frame per future scenario, dropping coordinate/geometry columns.
projection_data <- lapply(grid_future, function(scenario_grid) {
  scenario_df <- as.data.frame(scenario_grid)
  drop_cols <- names(scenario_df) %in% c("x_centroid", "y_centroid", "geometry")
  scenario_df[!drop_cols]
})
# Add the current scenario's predictors under the name "current".
projection_data$current <- df_var_preditors

And finally run our projections.

# Project the selected models onto every scenario (current + futures),
# writing the predictions into directory_projections.
df_pa %>% predict_to_folder(scenarios_list=projection_data,
                              models_folder=folder_models, 
                              pred_methods=model_selection, 
                              thr_criteria=thresh_criteria, 
                              output_folder=directory_projections,
                              thresholds_models_means=thresholds_models_means
    )

9 Visualizing Results

9.1 Obtain predictions

# Read each species' predictions back as a named list (simplify = FALSE
# guarantees a list rather than a possibly-simplified array).
predictions_sp <- sapply(spp_names,
                         function(x) sp_predictions_from_folder(x, directory_projections),
                         simplify = FALSE, USE.NAMES = TRUE)
# Mean predictions per scenario, then GCM ensembles per SSP.
pred_means <- predictions_means(predictions_sp, c(scenarios, 'current'))
ensembles <- gcm_ensemble(pred_means, ssp=c('current', 'ssp245'))

# Write one CSV per ensemble into the projections directory.
for (i in seq_along(ensembles)) {
  write.csv(ensembles[[i]], paste0(directory_projections,'/',names(ensembles)[i],'.csv'))
}

9.2 Frequency map in current scenario

# Map of mean prediction frequency in the current scenario.
ensemble_map(ensembles$current$current_freq_mean, grid_study_area, "Current", 'Frequence')

9.3 Presence map in current scenario

# Map of mean predicted presence/absence in the current scenario.
ensemble_map(ensembles$current$current_pa_mean, grid_study_area, "Current", 'Presence')